{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Make popular subset\n",
    "\n",
    "Creating a subset of ```titlemeta``` that is selected to maximize the ```copiesin25yrs``` column.\n",
    "\n",
    "This may not be \"popularity\" in an absolute sense; please don't take the name of the file in a literal-minded fashion.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy.stats import pearsonr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "title = pd.read_csv('../titlemeta.tsv', sep = '\\t', index_col = 'docid', low_memory = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2100, 29)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "frames = []\n",
    "\n",
    "for floor in range(1800, 2010, 10):\n",
    "    decade = title.loc[(title.latestcomp >= floor) & (title.latestcomp < (floor + 10)), : ]\n",
    "    sample = decade.nlargest(100, 'copiesin25yrs')  \n",
    "    frames.append(sample)\n",
    "\n",
    "subset = pd.concat(frames)\n",
    "subset.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>oldauthor</th>\n",
       "      <th>author</th>\n",
       "      <th>authordate</th>\n",
       "      <th>inferreddate</th>\n",
       "      <th>latestcomp</th>\n",
       "      <th>datetype</th>\n",
       "      <th>startdate</th>\n",
       "      <th>enddate</th>\n",
       "      <th>imprint</th>\n",
       "      <th>imprintdate</th>\n",
       "      <th>...</th>\n",
       "      <th>allcopiesofwork</th>\n",
       "      <th>copiesin25yrs</th>\n",
       "      <th>enumcron</th>\n",
       "      <th>volnum</th>\n",
       "      <th>title</th>\n",
       "      <th>parttitle</th>\n",
       "      <th>earlyedition</th>\n",
       "      <th>shorttitle</th>\n",
       "      <th>nonficprob</th>\n",
       "      <th>juvenileprob</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>docid</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>hvd.rsmd2m</th>\n",
       "      <td>More, Hannah</td>\n",
       "      <td>More, Hannah</td>\n",
       "      <td>1745-1833.</td>\n",
       "      <td>1809</td>\n",
       "      <td>1809</td>\n",
       "      <td>s</td>\n",
       "      <td>1809</td>\n",
       "      <td></td>\n",
       "      <td>New-York;David Carlisle;1809.</td>\n",
       "      <td>1809</td>\n",
       "      <td>...</td>\n",
       "      <td>7.642857</td>\n",
       "      <td>7.5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Cœlebs in search of a wife : | comprehending o...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>True</td>\n",
       "      <td>Cœlebs in search of a wife : comprehending obs...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>nyp.33433075814735</th>\n",
       "      <td>Madame Cottin, (Sophie)</td>\n",
       "      <td>Madame Cottin, (Sophie)</td>\n",
       "      <td>1770-1807.</td>\n",
       "      <td>1810</td>\n",
       "      <td>1807</td>\n",
       "      <td>s</td>\n",
       "      <td>1810</td>\n",
       "      <td></td>\n",
       "      <td>Poughkeepsie [N.Y.;Printed by Paraclete Potter...</td>\n",
       "      <td>1810</td>\n",
       "      <td>...</td>\n",
       "      <td>8.000000</td>\n",
       "      <td>6.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Elizabeth, | or, The exiles of Siberia. A tale...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>True</td>\n",
       "      <td>Elizabeth, or, The exiles of Siberia. A tale, ...</td>\n",
       "      <td>0.231656</td>\n",
       "      <td>0.008057</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mdp.39015014117546</th>\n",
       "      <td>Porter, Anna Maria</td>\n",
       "      <td>Porter, Anna Maria</td>\n",
       "      <td>1780-1832.</td>\n",
       "      <td>1807</td>\n",
       "      <td>1807</td>\n",
       "      <td>s</td>\n",
       "      <td>1807</td>\n",
       "      <td></td>\n",
       "      <td>London;Longman, Hurst, Rees, and Orme;1807.</td>\n",
       "      <td>1807</td>\n",
       "      <td>...</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>5.0</td>\n",
       "      <td>v.1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>The Hungarian brothers ... | $c: By Miss Anna ...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>True</td>\n",
       "      <td>The Hungarian brothers</td>\n",
       "      <td>0.076883</td>\n",
       "      <td>0.616757</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>uiuo.ark+=13960=t2t449b4t</th>\n",
       "      <td>Porter, Anna Maria</td>\n",
       "      <td>Porter, Anna Maria</td>\n",
       "      <td>1780-1832.</td>\n",
       "      <td>1807</td>\n",
       "      <td>1807</td>\n",
       "      <td>s</td>\n",
       "      <td>1807</td>\n",
       "      <td></td>\n",
       "      <td>London;Longman, Hurst, Rees, and Orme;1807.</td>\n",
       "      <td>1807</td>\n",
       "      <td>...</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>5.0</td>\n",
       "      <td>v.2</td>\n",
       "      <td>2.0</td>\n",
       "      <td>The Hungarian brothers</td>\n",
       "      <td>NaN</td>\n",
       "      <td>True</td>\n",
       "      <td>The Hungarian brothers</td>\n",
       "      <td>0.208313</td>\n",
       "      <td>0.158856</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>uiuo.ark+=13960=t3vt28143</th>\n",
       "      <td>Porter, Anna Maria</td>\n",
       "      <td>Porter, Anna Maria</td>\n",
       "      <td>1780-1832.</td>\n",
       "      <td>1807</td>\n",
       "      <td>1807</td>\n",
       "      <td>s</td>\n",
       "      <td>1807</td>\n",
       "      <td></td>\n",
       "      <td>London;Longman, Hurst, Rees, and Orme;1807.</td>\n",
       "      <td>1807</td>\n",
       "      <td>...</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>5.0</td>\n",
       "      <td>v.3</td>\n",
       "      <td>3.0</td>\n",
       "      <td>The Hungarian brothers</td>\n",
       "      <td>NaN</td>\n",
       "      <td>True</td>\n",
       "      <td>The Hungarian brothers</td>\n",
       "      <td>0.224745</td>\n",
       "      <td>0.221614</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 29 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         oldauthor                   author  \\\n",
       "docid                                                                         \n",
       "hvd.rsmd2m                            More, Hannah             More, Hannah   \n",
       "nyp.33433075814735         Madame Cottin, (Sophie)  Madame Cottin, (Sophie)   \n",
       "mdp.39015014117546              Porter, Anna Maria       Porter, Anna Maria   \n",
       "uiuo.ark+=13960=t2t449b4t       Porter, Anna Maria       Porter, Anna Maria   \n",
       "uiuo.ark+=13960=t3vt28143       Porter, Anna Maria       Porter, Anna Maria   \n",
       "\n",
       "                           authordate  inferreddate  latestcomp datetype  \\\n",
       "docid                                                                      \n",
       "hvd.rsmd2m                 1745-1833.          1809        1809        s   \n",
       "nyp.33433075814735         1770-1807.          1810        1807        s   \n",
       "mdp.39015014117546         1780-1832.          1807        1807        s   \n",
       "uiuo.ark+=13960=t2t449b4t  1780-1832.          1807        1807        s   \n",
       "uiuo.ark+=13960=t3vt28143  1780-1832.          1807        1807        s   \n",
       "\n",
       "                          startdate enddate  \\\n",
       "docid                                         \n",
       "hvd.rsmd2m                     1809           \n",
       "nyp.33433075814735             1810           \n",
       "mdp.39015014117546             1807           \n",
       "uiuo.ark+=13960=t2t449b4t      1807           \n",
       "uiuo.ark+=13960=t3vt28143      1807           \n",
       "\n",
       "                                                                     imprint  \\\n",
       "docid                                                                          \n",
       "hvd.rsmd2m                                     New-York;David Carlisle;1809.   \n",
       "nyp.33433075814735         Poughkeepsie [N.Y.;Printed by Paraclete Potter...   \n",
       "mdp.39015014117546               London;Longman, Hurst, Rees, and Orme;1807.   \n",
       "uiuo.ark+=13960=t2t449b4t        London;Longman, Hurst, Rees, and Orme;1807.   \n",
       "uiuo.ark+=13960=t3vt28143        London;Longman, Hurst, Rees, and Orme;1807.   \n",
       "\n",
       "                          imprintdate      ...      allcopiesofwork  \\\n",
       "docid                                      ...                        \n",
       "hvd.rsmd2m                       1809      ...             7.642857   \n",
       "nyp.33433075814735               1810      ...             8.000000   \n",
       "mdp.39015014117546               1807      ...             5.000000   \n",
       "uiuo.ark+=13960=t2t449b4t        1807      ...             5.000000   \n",
       "uiuo.ark+=13960=t3vt28143        1807      ...             5.000000   \n",
       "\n",
       "                          copiesin25yrs enumcron volnum  \\\n",
       "docid                                                     \n",
       "hvd.rsmd2m                          7.5      NaN    NaN   \n",
       "nyp.33433075814735                  6.0      NaN    NaN   \n",
       "mdp.39015014117546                  5.0      v.1    1.0   \n",
       "uiuo.ark+=13960=t2t449b4t           5.0      v.2    2.0   \n",
       "uiuo.ark+=13960=t3vt28143           5.0      v.3    3.0   \n",
       "\n",
       "                                                                       title  \\\n",
       "docid                                                                          \n",
       "hvd.rsmd2m                 Cœlebs in search of a wife : | comprehending o...   \n",
       "nyp.33433075814735         Elizabeth, | or, The exiles of Siberia. A tale...   \n",
       "mdp.39015014117546         The Hungarian brothers ... | $c: By Miss Anna ...   \n",
       "uiuo.ark+=13960=t2t449b4t                             The Hungarian brothers   \n",
       "uiuo.ark+=13960=t3vt28143                             The Hungarian brothers   \n",
       "\n",
       "                           parttitle earlyedition  \\\n",
       "docid                                               \n",
       "hvd.rsmd2m                       NaN         True   \n",
       "nyp.33433075814735               NaN         True   \n",
       "mdp.39015014117546               NaN         True   \n",
       "uiuo.ark+=13960=t2t449b4t        NaN         True   \n",
       "uiuo.ark+=13960=t3vt28143        NaN         True   \n",
       "\n",
       "                                                                  shorttitle  \\\n",
       "docid                                                                          \n",
       "hvd.rsmd2m                 Cœlebs in search of a wife : comprehending obs...   \n",
       "nyp.33433075814735         Elizabeth, or, The exiles of Siberia. A tale, ...   \n",
       "mdp.39015014117546                                    The Hungarian brothers   \n",
       "uiuo.ark+=13960=t2t449b4t                             The Hungarian brothers   \n",
       "uiuo.ark+=13960=t3vt28143                             The Hungarian brothers   \n",
       "\n",
       "                           nonficprob  juvenileprob  \n",
       "docid                                                \n",
       "hvd.rsmd2m                   0.000000      0.000000  \n",
       "nyp.33433075814735           0.231656      0.008057  \n",
       "mdp.39015014117546           0.076883      0.616757  \n",
       "uiuo.ark+=13960=t2t449b4t    0.208313      0.158856  \n",
       "uiuo.ark+=13960=t3vt28143    0.224745      0.221614  \n",
       "\n",
       "[5 rows x 29 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "subset.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "subset.to_csv('most_popular_subset.tsv', sep = '\\t', index_label = 'docid')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### validate the logic of ```copiesin25yrs```\n",
    "\n",
    "This version of ```most_popular_subset``` was created after experimentally recalculating our \"number of copies\" statistics. The earlier version of that calculation privileged multi-volume works. I want to confirm that our new calculation works better."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "oldsub = pd.read_csv('old_popular_subset.tsv', sep = '\\t', index_col = 'docid')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.43246930423\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(0.32977201247571758, 1.5787079700343248e-38)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lengths = []\n",
    "copies = []\n",
    "\n",
    "for recordid, df in oldsub.groupby('recordid'):\n",
    "    lengths.append(len(df))\n",
    "    copies.append(np.mean(df.copiesin25yrs))\n",
    "\n",
    "print(np.mean(lengths))\n",
    "pearsonr(lengths, copies)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.1751538892\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(0.02460100982758159, 0.29862515699656245)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lengths = []\n",
    "copies = []\n",
    "\n",
    "for recordid, df in subset.groupby('recordid'):\n",
    "    lengths.append(len(df))\n",
    "    copies.append(np.mean(df.copiesin25yrs))\n",
    "\n",
    "print(np.mean(lengths))\n",
    "pearsonr(lengths, copies)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Success!** The average number of volumes is down, and the strength of correlation between num-vols and num-copies is down. This suggests that our recalculation has succeeded in addressing the problem that over-privileged multi-volume works."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
